# Start from an empty workspace: remove every object (including names that
# start with a dot) from the environment this script is run in.
rm(list = ls(all = TRUE))
# NOTE(review): corpus(), tokens(), dfm(), dfm_trim() and stopwords() are
# called further down and are not defined in this file; they look like quanteda
# functions, so quanteda presumably has to be attached by the caller while the
# line below stays commented out -- confirm.
#library(quanteda)

# DATA AND PATHS
# --------------
# inFile: speech-level input, read with read.delim() below.
# outFile: .RData file written by the save() call at the end of the script.
inFile <- "./original_data/budget_debates_1999-2013_cleaned.tab"
outFile <- "./generated_data/1-corpus_and_wfm.RData"

# taoiseachs (used below to flag prime-minister speeches)
pmDataFile <- "./original_data/taoiseachs_1999-2013.csv"

# finance ministers (used below to flag finance-minister speeches)
fmDataFile <- "./original_data/finance_ministers_1983-2013.csv"

# opposition spokesperson (used below to flag opposition-spokesperson speeches)
osDataFile <- "./original_data/opposition_spokesperson_1983-2013.csv"


# FUNCTIONS
# ---------
# Clean up member names.
#
# Removes an "RIP" marker, parenthesised annotations such as "(Resigned)" and
# a leading title ("Mr.", "Mrs.", "Ms."), joins a detached "Ó" prefix to the
# word that follows it, and normalises whitespace.
#
# x: vector of member names (coerced with as.character()).
# Returns a character vector of the same length as x with no leading,
# trailing or repeated blanks.
clean.name <- function(x) {
    mname <- as.character(x)
    # match "RIP" as a whole word only, so surnames that merely contain
    # these letters (e.g. "Tripp") are left intact
    mname <- sub("\\bRIP\\b", "", mname, ignore.case = TRUE)
    mname <- gsub("\\(.+?\\)", "", mname)
    # glue the prefix to the surname so it survives splitting on blanks
    mname <- sub("Ó ", "Ó", mname, ignore.case = TRUE)
    # titles: the dot is escaped (an unescaped "." matches any character)
    # and the title must be a whole token
    mname <- sub("\\b(Mr|Mrs|Ms)\\.? ", "", mname, ignore.case = TRUE)
    # whitespace: non-breaking spaces become plain spaces, runs of spaces
    # collapse to one, and blanks left at either end by the removals above
    # are trimmed
    mname <- gsub("\u00a0", " ", mname, fixed = TRUE)
    mname <- gsub(" {2,}", " ", mname)
    trimws(mname)
}

# Retrieve last names.
#
# x: vector of member names; each name is cleaned with clean.name() and split
#    on single spaces.
# Returns a character vector of the same length as x holding the last token
# of each cleaned name. A name with no tokens (e.g. an empty string) yields
# NA, so the result is always a character vector rather than a list.
get.lname <- function(x) {
    mname <- clean.name(x)
    parts <- strsplit(mname, " ")
    vapply(
        parts,
        function(y) {
            # y[length(y)] would be character(0) for an empty token vector
            if (length(y) == 0L) NA_character_ else y[length(y)]
        },
        character(1)
    )
}

# Retrieve first names.
#
# x: vector of member names; each name is cleaned with clean.name() and split
#    on single spaces.
# Returns a character vector of the same length as x holding the first token
# of each cleaned name (NA when a name has no tokens). vapply() keeps the
# result a character vector even for zero-length input.
get.fname <- function(x) {
    mname <- clean.name(x)
    parts <- strsplit(mname, " ")
    vapply(parts, function(y) y[1], character(1))
}


# READ DATA
# ---------
# Speech-level data (tab-delimited); text columns are kept as character.
speakers <- read.delim(file = inFile, stringsAsFactors = FALSE)

# Office-holder look-up tables, one CSV each, all with a header row:
# taoiseachs, finance ministers and opposition spokespersons.
pmData <- read.csv(file = pmDataFile, header = TRUE)
fmData <- read.csv(file = fmDataFile, header = TRUE)
osData <- read.csv(file = osDataFile, header = TRUE)


# ORGANIZE DATA
# -------------
# Rename columns: underscores become dots, and "year" becomes "debate.year".
colnames(speakers)[colnames(speakers) == "member_name"] <- "member.name"
colnames(speakers)[colnames(speakers) == "party_name"] <- "party.name"
colnames(speakers)[colnames(speakers) == "const_name"] <- "const.name"
colnames(speakers)[colnames(speakers) == "position_numeric"] <- "position.numeric"
colnames(speakers)[colnames(speakers) == "year"] <- "debate.year"

# The literal string "NULL" marks a missing value in these two columns.
speakers$position <- replace(speakers$position, speakers$position == "NULL", NA)
speakers$department <- replace(speakers$department, speakers$department == "NULL", NA)

# Keep only the first row for every speechID.
speakers <- speakers[!duplicated(speakers$speechID), ]

# Shorten party names; both Workers' Party spellings map to one label.
speakers$party.name[speakers$party.name %in% "The Labour Party"] <- "Labour"
speakers$party.name[speakers$party.name %in% "Green Party"] <- "Green"
speakers$party.name[speakers$party.name %in% "Socialist Party"] <- "Socialist"
speakers$party.name[speakers$party.name %in% c("The Workers' Party",
                                               "Sinn Féin the Workers' Party")] <- "Workers' Party"

# Year-dependent party relabelling. The statements run in this order, so a
# row relabelled by one rule can be picked up by the next.
speakers$party.name[which(speakers$party.name == "Workers' Party" &
                          speakers$debate.year >= 1993)] <- "Democratic Left"
speakers$party.name[which(speakers$party.name == "Democratic Left" &
                          speakers$debate.year >= 1999)] <- "Labour"
speakers$party.name[which(speakers$party.name == "Progressive Democrats" &
                          speakers$debate.year >= 2010)] <- "Fianna Fáil"

# Individual party affiliation correction.
speakers$party.name[which(speakers$member.name == "Ms. Mary Harney" &
                          speakers$debate.year <= 2009)] <- "Fianna Fáil"


# VARIABLE CODING
# ---------------
# Name parts
# ----------
# Last name, first name, and first initial followed by a dot.
speakers$last.name <- get.lname(speakers$member.name)
speakers$first.name <- get.fname(speakers$member.name)
speakers$first.name.abbr <- paste0(substr(speakers$first.name, 1, 1), ".")


# Party abbreviations
# -------------------
# Parties missing from the table keep NA.
speakers$party.abbrv <- NA
partyCodes <- c(
    "Democratic Left" = "DL",
    "Fianna Fáil" = "FF",
    "Fine Gael" = "FG",
    "Green" = "GRE",
    "Independent" = "Ind",
    "Labour" = "LAB",
    "People Before Profit Alliance" = "PBPA",
    "Progressive Democrats" = "PD",
    "Sinn Féin" = "SF",
    "Socialist" = "SOC"
)
for (partyLabel in names(partyCodes)) {
    speakers$party.abbrv[speakers$party.name == partyLabel] <- partyCodes[[partyLabel]]
}
# drop the loop helpers so the workspace holds the same objects as before
rm(partyCodes, partyLabel)


# Government membership
# ---------------------
# govt is 1 for speakers whose party was in office in the debate year, else 0.
speakers$govt <- 0

# 1998 - 2006: Fianna Fail + Progressive Democrats
speakers$govt[speakers$debate.year >= 1998 & speakers$debate.year <= 2006 &
              speakers$party.name %in% c("Fianna Fáil", "Progressive Democrats")] <- 1
# 2007 - 2010: Fianna Fail + Green + Progressive Democrats
speakers$govt[speakers$debate.year >= 2007 & speakers$debate.year <= 2010 &
              speakers$party.name %in% c("Fianna Fáil", "Progressive Democrats", "Green")] <- 1
# 2011 - 2013: Fine Gael + Labour
speakers$govt[speakers$debate.year >= 2011 & speakers$debate.year <= 2013 &
              speakers$party.name %in% c("Fine Gael", "Labour")] <- 1


# code PM, FM, and OS
# -------------------
# For each office (PM, FM, OS in that order): drop the "name" column from the
# look-up table, shift its year back by one so it lines up with debate.year,
# add an indicator column named after the office, left-join it onto the
# speeches by (year, memberID), and set the indicator to 0 where no match
# was found.
roleTables <- list(pm = pmData, fm = fmData, os = osData)
for (role in names(roleTables)) {
    tmp <- roleTables[[role]]
    tmp <- tmp[names(tmp) != "name"]
    tmp$year <- tmp$year - 1
    tmp[[role]] <- 1
    speakers <- merge(speakers, tmp,
                      by.x = c("debate.year", "memberID"),
                      by.y = c("year", "memberID"),
                      all.x = TRUE)
    speakers[[role]][is.na(speakers[[role]])] <- 0
}
# drop the loop helpers; `tmp` is left holding the OS table as before
rm(roleTables, role)

# data corrections
# ----------------
# NOTE(review): the party.name correction for Ms. Mary Harney earlier in this
# script uses debate.year <= 2009, while this one uses < 2009 -- confirm the
# differing cut-offs are intended.
speakers$party.abbrv[speakers$member.name=="Ms. Mary Harney" & speakers$debate.year<2009] <- "PD"

speakers$party.abbrv[speakers$member.name=="Mr. Martin Cullen (Resigned)"] <- "FF"

# for budget years 2012 and 2013, combine Finance Minister Department with
# Reform Department because both oversaw budget decisions
# which() drops rows with a missing budget_year so NA cannot leak into fmID
fmID <- unique(speakers$memberID[which(speakers$budget_year==2012 & speakers$fm==1)]) # note: same FM in 2012 and 2013
reformRows <- which(speakers$department=="Public Expenditure and Reform" & speakers$budget_year %in% c(2012, 2013))
if (length(reformRows) > 0) {
    # the reassignment is only meaningful with exactly one finance minister
    # ID; fail loudly instead of recycling several IDs across the rows
    if (length(fmID) != 1) {
        stop("expected exactly one finance minister memberID for budget year 2012, found ",
             length(fmID), call. = FALSE)
    }
    speakers$memberID[reformRows] <- fmID
}

# only keep cabinet members (rows with a non-missing position)
speakers <- speakers[!is.na(speakers$position),]


# GENERATE CORPUS AND WFM
# -----------------------
# One corpus and one word-frequency matrix (WFM) per debate year, stored in
# lists keyed by the year as a string.
corpus.list <- list()
wfm.list <- list()

for (i in sort(unique(speakers$debate.year))) {
    yearKey <- as.character(i)
    yearRows <- speakers[speakers$debate.year == i, ]

    # progress message
    cat(sprintf("Budget debate %s \n", i))

    # corpus of this year's speeches, carrying memberID as document variable
    corpus.list[[yearKey]] <- corpus(
        yearRows$speech,
        docvars = yearRows[names(yearRows) == "memberID"])

    # tokenise
    toks <- tokens(corpus.list[[yearKey]], what = "word",
                   remove_numbers = TRUE,
                   remove_punct = TRUE,
                   remove_symbols = TRUE,
                   remove_twitter = TRUE,
                   remove_hyphens = TRUE,
                   remove_url = TRUE,
                   verbose = TRUE)

    # WFM grouped by memberID: lower-cased, English stopwords removed, stemmed
    yearWfm <- dfm(toks,
                   groups = "memberID",
                   tolower = TRUE,
                   remove = stopwords("english"),
                   stem = TRUE,
                   verbose = TRUE)

    # trim: keep features that occur in at least two documents
    yearWfm <- dfm_trim(yearWfm, min_termfreq = 1, min_docfreq = 2)

    # drop documents with k or fewer words: these are interruptions that were
    # combined into a document but are not real speeches
    k <- 50
    wfm.list[[yearKey]] <- yearWfm[rowSums(yearWfm) > k, ]

    # drop the per-iteration helpers so the workspace matches the original
    rm(yearKey, yearRows, yearWfm)
}


# GENERATE DATA WITH SPEAKER ATTRIBUTES
# -------------------------------------
# Speaker attributes: every column of `speakers` except the speech text.
data <- speakers[, names(speakers) != "speech"]

# Within each debate year put finance-minister rows (fm == 1) first, so that
# the de-duplication below keeps the FM row where speeches were combined under
# one memberID in budget years 2012 and 2013.
data <- data[with(data, order(debate.year, -fm)), ]

# one row per debate-speaker combination
data <- data[!duplicated(paste(data$debate.year, data$memberID)), ]

# "<year>.<memberID>" identifier meant to match the row names of the WFM
data$yearMemberID <- paste(data$debate.year, data$memberID, sep = ".")

# (disabled) drop speakers that do not appear in the WFM
#data <- data[(data$yearMemberID %in%rownames(wfm)),]

# there is no PM speech in 2000; flag the Tánaiste as PM instead
data$pm[data$debate.year %in% 2000 & data$position %in% "Tánaiste"] <- 1

# write corpora, WFMs and both data frames to the output file
save(corpus.list, wfm.list, data, speakers, file = outFile)
